Y = B0 + B1 X + E
Assumptions: the errors E are independent and normally distributed with mean 0 and common variance sigma^2.
# Importing the dataset of salaries versus years of experience
dataset <- read.csv("DATA/Salary_Data.csv")
print(dataset)
## YearsExperience Salary
## 1 1.1 39343
## 2 1.3 46205
## 3 1.5 37731
## 4 2.0 43525
## 5 2.2 39891
## 6 2.9 56642
## 7 3.0 60150
## 8 3.2 54445
## 9 3.2 64445
## 10 3.7 57189
## 11 3.9 63218
## 12 4.0 55794
## 13 4.0 56957
## 14 4.1 57081
## 15 4.5 61111
## 16 4.9 67938
## 17 5.1 66029
## 18 5.3 83088
## 19 5.9 81363
## 20 6.0 93940
## 21 6.8 91738
## 22 7.1 98273
## 23 7.9 101302
## 24 8.2 113812
## 25 8.7 109431
## 26 9.0 105582
## 27 9.5 116969
## 28 9.6 112635
## 29 10.3 122391
## 30 10.5 121872
Splitting the dataset into the Training set and the Test set.
Reminder: the seed is fixed below so the random split is reproducible.
# install.packages('caTools')
library(caTools)
set.seed(123)
# Split on the dependent variable so that Salary values are well
# distributed between the training and test sets.
split <- sample.split(dataset$Salary, SplitRatio = 2/3)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# Training sample
print(training_set)
## YearsExperience Salary
## 1 1.1 39343
## 3 1.5 37731
## 6 2.9 56642
## 7 3.0 60150
## 9 3.2 64445
## 10 3.7 57189
## 12 4.0 55794
## 13 4.0 56957
## 14 4.1 57081
## 15 4.5 61111
## 17 5.1 66029
## 18 5.3 83088
## 19 5.9 81363
## 22 7.1 98273
## 23 7.9 101302
## 25 8.7 109431
## 27 9.5 116969
## 28 9.6 112635
## 29 10.3 122391
## 30 10.5 121872
# Test sample
print(test_set)
## YearsExperience Salary
## 2 1.3 46205
## 4 2.0 43525
## 5 2.2 39891
## 8 3.2 54445
## 11 3.9 63218
## 16 4.9 67938
## 20 6.0 93940
## 21 6.8 91738
## 24 8.2 113812
## 26 9.0 105582
Fitting Simple Linear Regression to the Training set
# Fit a simple linear regression of Salary on YearsExperience
regressor <- lm(formula = Salary ~ YearsExperience, data = training_set)
summary(regressor)
##
## Call:
## lm(formula = Salary ~ YearsExperience, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7325.1 -3814.4 427.7 3559.7 8884.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25592 2646 9.672 1.49e-08 ***
## YearsExperience 9365 421 22.245 1.52e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5391 on 18 degrees of freedom
## Multiple R-squared: 0.9649, Adjusted R-squared: 0.963
## F-statistic: 494.8 on 1 and 18 DF, p-value: 1.524e-14
Predicting the Test set results
# Predict salaries for the held-out test observations
y_pred <- predict(regressor, newdata = test_set)
print(y_pred)
## 2 4 5 8 11 16 20 21
## 37766.77 44322.33 46195.35 55560.43 62115.99 71481.07 81782.66 89274.72
## 24 26
## 102385.84 109877.90
# Visualising the Training set results
# install.packages("rlang")
# install.packages("ggplot2")
library(ggplot2)
# Red points: observed salaries; blue line: regression line fitted above
ggplot() +
  geom_point(data = training_set,
             aes(x = YearsExperience, y = Salary),
             colour = 'red') +
  geom_line(data = training_set,
            aes(x = YearsExperience,
                y = predict(regressor, newdata = training_set)),
            colour = 'blue') +
  ggtitle('Salary vs Experience (Training set)') +
  xlab('Years of experience') +
  ylab('Salary')
# Visualising the Test set results
library(ggplot2)
# Red points: test observations; the blue line is deliberately the SAME line
# fitted on the training set (the model does not change between plots)
ggplot() +
  geom_point(data = test_set,
             aes(x = YearsExperience, y = Salary),
             colour = 'red') +
  geom_line(data = training_set,
            aes(x = YearsExperience,
                y = predict(regressor, newdata = training_set)),
            colour = 'blue') +
  ggtitle('Salary vs Experience (Test set)') +
  xlab('Years of experience') +
  ylab('Salary')
Y = B0 + B1 X1 + B2 X1^2 + B3 X1^3 + B4 X1^4 + E
# Importing the dataset, keeping only the Level and Salary columns
dataset <- read.csv("DATA/Position_Salaries.csv")
dataset <- dataset[2:3]
print(dataset)
## Level Salary
## 1 1 45000
## 2 2 50000
## 3 3 60000
## 4 4 80000
## 5 5 110000
## 6 6 150000
## 7 7 200000
## 8 8 300000
## 9 9 500000
## 10 10 1000000
In this example we do not split the dataset, because the goal is only to illustrate the difference between linear regression and polynomial regression.
# Fit a plain linear regression of Salary on all remaining columns (Level)
lin_reg <- lm(formula = Salary ~ ., data = dataset)
summary(lin_reg)
##
## Call:
## lm(formula = Salary ~ ., data = dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -170818 -129720 -40379 65856 386545
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -195333 124790 -1.565 0.15615
## Level 80879 20112 4.021 0.00383 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 182700 on 8 degrees of freedom
## Multiple R-squared: 0.669, Adjusted R-squared: 0.6277
## F-statistic: 16.17 on 1 and 8 DF, p-value: 0.003833
# Fit a degree-4 polynomial regression by adding powers of Level as columns,
# then regressing Salary on all of them
dataset$Level2 <- dataset$Level^2
dataset$Level3 <- dataset$Level^3
dataset$Level4 <- dataset$Level^4
poly_reg <- lm(formula = Salary ~ ., data = dataset)
summary(poly_reg)
##
## Call:
## lm(formula = Salary ~ ., data = dataset)
##
## Residuals:
## 1 2 3 4 5 6 7 8 9 10
## -8357 18240 1358 -14633 -11725 6725 15997 10006 -28695 11084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 184166.7 67768.0 2.718 0.04189 *
## Level -211002.3 76382.2 -2.762 0.03972 *
## Level2 94765.4 26454.2 3.582 0.01584 *
## Level3 -15463.3 3535.0 -4.374 0.00719 **
## Level4 890.2 159.8 5.570 0.00257 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20510 on 5 degrees of freedom
## Multiple R-squared: 0.9974, Adjusted R-squared: 0.9953
## F-statistic: 478.1 on 4 and 5 DF, p-value: 1.213e-06
# Visualising the Linear Regression results
# install.packages('ggplot2')
library(ggplot2)
# Red points: observed salaries; blue line: linear fit (clearly underfits)
ggplot() +
  geom_point(data = dataset,
             aes(x = Level, y = Salary),
             colour = 'red') +
  geom_line(data = dataset,
            aes(x = Level, y = predict(lin_reg, newdata = dataset)),
            colour = 'blue') +
  ggtitle('Linear Regression') +
  xlab('Level') +
  ylab('Salary')
# Visualising the Polynomial Regression results
# install.packages('ggplot2')
library(ggplot2)
# Red points: observed salaries; blue curve: degree-4 polynomial fit
ggplot() +
  geom_point(data = dataset,
             aes(x = Level, y = Salary),
             colour = 'red') +
  geom_line(data = dataset,
            aes(x = Level, y = predict(poly_reg, newdata = dataset)),
            colour = 'blue') +
  ggtitle('Polynomial Regression') +
  xlab('Level') +
  ylab('Salary')
# Importing the dataset, keeping Age, EstimatedSalary and Purchased
dataset <- read.csv("DATA/Social_Network_Ads.csv")
dataset <- dataset[3:5]
# Encode the target as a factor with explicit levels 0/1
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
head(dataset)
## Age EstimatedSalary Purchased
## 1 19 19000 0
## 2 35 20000 0
## 3 26 43000 0
## 4 27 57000 0
## 5 19 76000 0
## 6 27 58000 0
Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
library(data.table)
set.seed(123)
# Split on the target so both sets keep the same class balance
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# Feature scaling: Age and EstimatedSalary live on very different scales, and
# many models would otherwise be dominated by the larger-scale feature.
# FIX: the scaling parameters (mean, sd) must be estimated on the training set
# only and then reused on the test set. The original code scaled each set with
# its own statistics, which applies two different transformations and leaks
# test-set information into the preprocessing.
train_scaled <- scale(training_set[-3])
training_set[-3] <- train_scaled
test_set[-3] <- scale(test_set[-3],
                      center = attr(train_scaled, "scaled:center"),
                      scale = attr(train_scaled, "scaled:scale"))
# Training sample (identical to the original output — training scaling unchanged)
head(training_set)
## Age EstimatedSalary Purchased
## 1 -1.7655475 -1.4733414 0
## 3 -1.0962966 -0.7883761 0
## 6 -1.0006894 -0.3602727 0
## 7 -1.0006894 0.3817730 0
## 8 -0.5226531 2.2654277 1
## 10 -0.2358313 -0.1604912 0
# Test sample
# NOTE(review): the values below were captured with the original per-set
# scaling; with training-based scaling the test values will differ slightly.
head(test_set)
## Age EstimatedSalary Purchased
## 2 -0.3041906 -1.5135434 0
## 4 -1.0599437 -0.3245603 0
## 5 -1.8156969 0.2859986 0
## 9 -1.2488820 -1.0957926 0
## 12 -1.1544129 -0.4852337 0
## 18 0.6405008 -1.3207353 1
Fitting Logistic Regression to the Training set
# Fit a logistic regression (binomial GLM) of Purchased on Age and Salary
classifier <- glm(formula = Purchased ~ ., family = binomial, data = training_set)
summary(classifier)
##
## Call:
## glm(formula = Purchased ~ ., family = binomial, data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0753 -0.5235 -0.1161 0.3224 2.3977
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1923 0.2018 -5.908 3.47e-09 ***
## Age 2.6324 0.3461 7.606 2.83e-14 ***
## EstimatedSalary 1.3947 0.2326 5.996 2.03e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 390.89 on 299 degrees of freedom
## Residual deviance: 199.78 on 297 degrees of freedom
## AIC: 205.78
##
## Number of Fisher Scoring iterations: 6
Predicting the Test set results
# Predict class-1 probabilities on the test features (drop the target column)
prob_pred <- predict(classifier, type = 'response', newdata = test_set[-3])
# TODO: plot a ROC curve to choose the probability cut-off instead of 0.5
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
# Making the Confusion Matrix
# FIX: y_pred is already coded 0/1, so tabulate it directly. The original
# re-thresholded it with `y_pred > 0.5`, a redundant comparison that merely
# relabelled the columns FALSE/TRUE.
cm <- table(test_set[, 3], y_pred)
print(cm)
##    y_pred
##      0  1
##  0  57  7
##  1  10 26
On doit revoir la viz
# Visualising the Training set results
# Library « ElemStatLearn » got archived :
# go to : https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/
# download latest version
# in Rstudio, go to « tools » > « install packages »
# library(ElemStatLearn)
# Build a fine grid over the (Age, EstimatedSalary) plane, classify every grid
# point with the fitted logistic model, then draw the decision boundary and
# the coloured class regions, finally overlaying the actual observations.
set = training_set
# Axis ranges padded by 1 (features are already standardized, so 1 ~ one sd)
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
# Cartesian product of the two axes = every "pixel" of the background
grid_set = expand.grid(X1, X2)
# Column names must match the training features for predict() to work
colnames(grid_set) = c('Age', 'EstimatedSalary')
prob_set = predict(classifier, type = 'response', newdata = grid_set)
y_grid = ifelse(prob_set > 0.5, 1, 0)
plot(set[, -3],
main = 'Logistic Regression (Training set)',
xlab = 'Age', ylab = 'Estimated Salary',
xlim = range(X1), ylim = range(X2))
# Decision boundary: contour of the 0/1 grid predictions
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# Colour each grid point by its predicted class (green = 1, red = 0)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Overlay the observations, filled by their TRUE class
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# Visualising the Test set results
# Same grid-plot procedure as for the training set, but the observations
# overlaid come from the test set; the classifier (and hence the decision
# boundary) is unchanged.
# library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
# Names must match the model's predictors
colnames(grid_set) = c('Age', 'EstimatedSalary')
prob_set = predict(classifier, type = 'response', newdata = grid_set)
y_grid = ifelse(prob_set > 0.5, 1, 0)
plot(set[, -3],
main = 'Logistic Regression (Test set)',
xlab = 'Age', ylab = 'Estimated Salary',
xlim = range(X1), ylim = range(X2))
# Decision boundary and class regions from the grid predictions
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Test observations filled by their true class
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
Variables : j désigne le pays d'origine et t le mois, sur la période 2007-2019.
Variables binaires :
library(dplyr)
library(gravity)
library(caret)
library(tidyverse)
library(ggplot2)
library(plotly)
library(reshape2)
library(zoo)
library(gridExtra)
library(lmtest)
# Import the panel data
df <- read.csv("DATA/data_gravity.csv", sep = ";", header = TRUE)
# Parse "mm/YYYY" strings into yearmon dates
df$period <- as.yearmon(df$period, format = "%m/%Y")
# Plot four representative origin countries
keep <- df$id == "USA" | df$id == "Italie" | df$id == "Japon" | df$id == "France"
graph1 <- ggplot(data = df[keep, ],
                 aes(x = period, y = nb_tourism, group = id, colour = id)) +
  geom_line() +
  labs(title = "Fréquentation Touristiques",
       subtitle = "Données ISPF, 15 avril 2021",
       y = "nb de touriste", x = "date")
#ggplotly()
graph2 <- ggplot(data = df[keep, ],
                 aes(x = period, y = price, group = id, colour = id)) +
  geom_line() +
  labs(title = "Indices des cours des actions",
       subtitle = "Données OECD, 25 avril 2021",
       y = "Share prices", x = "date")
graph3 <- ggplot(data = df[keep, ],
                 aes(x = period, y = dist_oil, group = id, colour = id)) +
  geom_line() +
  labs(title = "Distance * brent_j",
       subtitle = "Données Brent, 15 avril 2021",
       y = "", x = "date")
#ggplotly()
# Stack the three series vertically
grid.arrange(graph1, graph2, graph3,
             ncol = 1, nrow = 3)
# Same tourism plot again, this time rendered interactively with plotly
graph1 <- ggplot(data = df[df$id == "USA" | df$id == "Italie" |
                             df$id == "Japon" | df$id == "France", ],
                 aes(x = period, y = nb_tourism, group = id, colour = id)) +
  geom_line() +
  labs(title = "Fréquentation Touristiques",
       subtitle = "Données ISPF, 15 avril 2021",
       y = "nb de touriste", x = "date")
ggplotly()
# --- Simple OLS model: log-log with month dummies and region/hub indicators ---
fit_mco <- lm(
  log(nb_tourism + 1) ~ log(dist_oil) + log(price) + Tx_change +
    as.factor(mois) + europe + asia + america + hub,
  data = df
)
summary(fit_mco)
##
## Call:
## lm(formula = log(nb_tourism + 1) ~ log(dist_oil) + log(price) +
## Tx_change + as.factor(mois) + europe + asia + america + hub,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3849 -0.7076 -0.0506 0.7034 4.1343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.821909 0.823585 4.641 3.58e-06 ***
## log(dist_oil) 0.148586 0.066023 2.251 0.024465 *
## log(price) 0.199995 0.067169 2.977 0.002922 **
## Tx_change -0.229469 0.146453 -1.567 0.117224
## as.factor(mois)2 -0.004123 0.087526 -0.047 0.962428
## as.factor(mois)3 0.044781 0.087646 0.511 0.609430
## as.factor(mois)4 0.029811 0.087782 0.340 0.734169
## as.factor(mois)5 0.032598 0.087874 0.371 0.710684
## as.factor(mois)6 0.059903 0.087939 0.681 0.495785
## as.factor(mois)7 0.322034 0.087864 3.665 0.000250 ***
## as.factor(mois)8 0.134308 0.087715 1.531 0.125797
## as.factor(mois)9 0.229918 0.087785 2.619 0.008847 **
## as.factor(mois)10 0.314954 0.087882 3.584 0.000342 ***
## as.factor(mois)11 0.162616 0.087638 1.856 0.063587 .
## as.factor(mois)12 0.137751 0.087585 1.573 0.115846
## europe -2.423336 0.097344 -24.895 < 2e-16 ***
## asia -2.925646 0.091325 -32.036 < 2e-16 ***
## america -1.375575 0.082726 -16.628 < 2e-16 ***
## hub 3.581755 0.059757 59.938 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.18 on 4349 degrees of freedom
## Multiple R-squared: 0.5536, Adjusted R-squared: 0.5517
## F-statistic: 299.6 on 18 and 4349 DF, p-value: < 2.2e-16
# Checking the statistical assumptions of the OLS model
# Normality of the residuals (Shapiro-Wilk)
shapiro.test(resid(fit_mco)) # p-value < 5%: we REJECT normality of the residuals
##
## Shapiro-Wilk normality test
##
## data: resid(fit_mco)
## W = 0.99234, p-value = 1.39e-14
# Ramsey RESET test: checks whether the linear specification is adequate
reset(fit_mco)
##
## RESET test
##
## data: fit_mco
## RESET = 9.4814, df1 = 2, df2 = 4347, p-value = 7.784e-05
# p-value 7.784e-05 < 0.05: the RESET test REJECTS the linear specification
# (evidence of misspecification / omitted non-linear terms)
# Check the homoscedasticity assumption (Breusch-Pagan)
bptest(fit_mco) # p-value < 0.05
##
## studentized Breusch-Pagan test
##
## data: fit_mco
## BP = 498.16, df = 18, p-value < 2.2e-16
# Homoscedasticity of the residuals is rejected at the 5% level
# ----
# ---- Gravity model ----
# Poisson pseudo-maximum likelihood (PPML) with fixed effects;
# gravity model estimated in log-log form
fit <- ppml(
  dependent_variable = "lnb_tourism",
  distance = "dist_oil",
  additional_regressors = c(
    "lprice", "Tx_change", "europe", "asia", "america", "hub",
    "french_bee", "expedia",
    "fev", "mar", "avr", "mai", "jui", "juil",
    "aou", "sep", "oct", "nov", "dec"
  ),
  data = df
)
summary(fit)
##
## Call:
## y_ppml ~ dist_log + lprice + Tx_change + europe + asia + america +
## hub + french_bee + expedia + fev + mar + avr + mai + jui +
## juil + aou + sep + oct + nov + dec
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.94718 -0.34652 -0.00908 0.33007 1.85648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3384556 0.1882270 7.111 1.34e-12 ***
## dist_log 0.0289085 0.0148115 1.952 0.051031 .
## lprice 0.0401641 0.0173177 2.319 0.020427 *
## Tx_change -0.0403766 0.0402402 -1.003 0.315728
## europe -0.4627869 0.0206298 -22.433 < 2e-16 ***
## asia -0.5742328 0.0193732 -29.641 < 2e-16 ***
## america -0.2633078 0.0166032 -15.859 < 2e-16 ***
## hub 0.6104078 0.0112029 54.487 < 2e-16 ***
## french_bee 0.0297984 0.0140130 2.126 0.033519 *
## expedia -0.0040554 0.0120557 -0.336 0.736592
## fev -0.0006656 0.0200444 -0.033 0.973511
## mar 0.0099578 0.0200117 0.498 0.618791
## avr 0.0068489 0.0200488 0.342 0.732661
## mai 0.0053984 0.0201048 0.269 0.788317
## jui 0.0113721 0.0200673 0.567 0.570949
## juil 0.0654098 0.0198007 3.303 0.000963 ***
## aou 0.0268497 0.0199661 1.345 0.178770
## sep 0.0467012 0.0198719 2.350 0.018812 *
## oct 0.0656808 0.0198526 3.308 0.000946 ***
## nov 0.0347782 0.0198876 1.749 0.080406 .
## dec 0.0295773 0.0199126 1.485 0.137522
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 0.3381264)
##
## Null deviance: 3007.6 on 4367 degrees of freedom
## Residual deviance: 1603.4 on 4347 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
PLSR : Partial Least Squares Regression
https://rchailan.github.io/assets/lectures/anamultidim/tp3-pcr-pls.html
Base Dialyse.xls (5 317 patients), analyse de la survie de patients atteints d’insuffisance rénale traités par hémodialyse
library(readxl)
library(survival)
# Load Dialyse.xls (5,317 haemodialysis patients) for the survival analysis
path_base <- "DATA/Dialyse.xls"
Dialyse <- read_excel(path_base)
BASE <- Dialyse
# Recode the categorical variables as factors
BASE$Maladie1 <- as.factor(BASE$Maladie)
BASE$Centre <- as.factor(BASE$Centre)
BASE$Age <- as.factor(BASE$Age)
BASE$Homme <- factor(BASE$Homme, labels = c("Femme", "Homme"))
summary(BASE)
## Patient Status Temps Maladie Centre
## Min. : 1 Min. :0.000 Min. : 1.00 Length:5317 A: 846
## 1st Qu.:1330 1st Qu.:0.000 1st Qu.: 4.00 Class :character B:1604
## Median :2659 Median :0.000 Median :11.00 Mode :character C:2867
## Mean :2659 Mean :0.239 Mean :14.39
## 3rd Qu.:3988 3rd Qu.:0.000 3rd Qu.:23.00
## Max. :5317 Max. :1.000 Max. :44.00
## Age Homme Maladie1
## [25-50[ :1999 Femme:2647 diabete :1251
## [50-70[ :2303 Homme:2670 hypertension:2693
## Moins de 25: 251 renale :1373
## Plus de 70 : 764
##
##
library(ggplot2)
library(ggfortify)
# Overall Kaplan-Meier curve, with reference lines at 50% and 75% survival
fit <- survfit(Surv(Temps, Status) ~ 1, data = BASE)
# summary(fit)
autoplot(fit) +
  ggtitle("Courbe de survie") +
  xlab("Durée") +
  ylab("Probabilité de survie") +
  geom_line(aes(y = 0.5), col = 6) +
  geom_line(aes(y = 0.75), col = 5)
# Survival curves by age group
fit1 <- survfit(Surv(Temps, Status) ~ Age, data = BASE)
# summary(fit1)
autoplot(fit1) +
  ggtitle("Courbe de survie en fonction de l'âge") +
  xlab("Durée") +
  ylab("Probabilité de survie")
# Survival curves by treatment centre
fit2 <- survfit(Surv(Temps, Status) ~ Centre, data = BASE)
# summary(fit2)
autoplot(fit2) +
  ggtitle("Courbe de survie en fonction des centres") +
  xlab("Durée") +
  ylab("Probabilité de survie")
# Survival curves by sex
fit3 <- survfit(Surv(Temps, Status) ~ Homme, data = BASE)
# summary(fit3)
autoplot(fit3) +
  ggtitle("Courbe de survie en fonction des genres") +
  xlab("Durée") +
  ylab("Probabilité de survie")
# Survival curves by disease
fit4 <- survfit(Surv(Temps, Status) ~ Maladie, data = BASE)
# summary(fit4)
autoplot(fit4) +
  ggtitle("Courbe de survie en fonction des maladies") +
  xlab("Durée") +
  ylab("Probabilité de survie")
Il existe une différence significative entre les groupes pour les trois variables ci-dessous (Age, Centre, Maladie), mais pas pour la variable Homme.
# Log-rank tests: p-value of the chi-squared statistic for each covariate
diff <- survdiff(Surv(Temps, Status) ~ Age, data = BASE)
pchisq(diff$chisq, length(diff$n) - 1, lower.tail = FALSE)
## [1] 9.19213e-74
diff1 <- survdiff(Surv(Temps, Status) ~ Centre, data = BASE)
pchisq(diff1$chisq, length(diff1$n) - 1, lower.tail = FALSE)
## [1] 1.547109e-05
diff2 <- survdiff(Surv(Temps, Status) ~ Maladie, data = BASE)
pchisq(diff2$chisq, length(diff2$n) - 1, lower.tail = FALSE)
## [1] 1.812106e-13
# FIX: use a distinct name here — the original reused diff2 for the Homme
# test, silently clobbering the Maladie result
diff3 <- survdiff(Surv(Temps, Status) ~ Homme, data = BASE)
pchisq(diff3$chisq, length(diff3$n) - 1, lower.tail = FALSE)
## [1] 0.4009333
# Stepwise selection on the Cox model: a covariate is kept when dropping it
# would increase the AIC (lower AIC = better)
cfit1 <- coxph(Surv(Temps, Status) ~ Centre + Age + Maladie, data = BASE)
result.step <- step(cfit1, scope = list(upper = ~Centre + Age + Maladie, lower = ~1))
## Start: AIC=20029.81
## Surv(Temps, Status) ~ Centre + Age + Maladie
##
## Df AIC
## <none> 20030
## - Maladie 2 20055
## - Centre 2 20063
## - Age 3 20332
summary(cfit1)
## Call:
## coxph(formula = Surv(Temps, Status) ~ Centre + Age + Maladie,
## data = BASE)
##
## n= 5317, number of events= 1271
##
## coef exp(coef) se(coef) z Pr(>|z|)
## CentreB -0.47756 0.62030 0.08542 -5.591 2.26e-08 ***
## CentreC -0.15978 0.85233 0.07584 -2.107 0.035121 *
## Age[50-70[ 0.74908 2.11504 0.07198 10.407 < 2e-16 ***
## AgeMoins de 25 -0.41346 0.66136 0.21336 -1.938 0.052645 .
## AgePlus de 70 1.36251 3.90599 0.08248 16.519 < 2e-16 ***
## Maladiehypertension -0.36576 0.69367 0.06674 -5.480 4.25e-08 ***
## Maladierenale -0.28329 0.75330 0.08012 -3.536 0.000407 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## CentreB 0.6203 1.6121 0.5247 0.7333
## CentreC 0.8523 1.1733 0.7346 0.9889
## Age[50-70[ 2.1150 0.4728 1.8368 2.4355
## AgeMoins de 25 0.6614 1.5120 0.4353 1.0047
## AgePlus de 70 3.9060 0.2560 3.3229 4.5913
## Maladiehypertension 0.6937 1.4416 0.6086 0.7906
## Maladierenale 0.7533 1.3275 0.6438 0.8814
##
## Concordance= 0.653 (se = 0.008 )
## Likelihood ratio test= 388.3 on 7 df, p=<2e-16
## Wald test = 372.4 on 7 df, p=<2e-16
## Score (logrank) test = 408.5 on 7 df, p=<2e-16
Une seule variable n'est pas significative au seuil de 5 % : Age « Moins de 25 ». Par rapport à la catégorie de référence (25-50 ans), un patient de plus de 70 ans voit son risque de décès multiplié par environ 3,9 et un patient de 50-70 ans par environ 2,1.
La concordance mesure le pourcentage de paires d'observations de la base pour lesquelles l'observation au temps de survie le plus élevé a aussi la plus grande probabilité de survie prédite par le modèle.
Le test du rapport de vraisemblance (comme les tests de Wald et du log-rank) confirme la significativité globale du modèle.
# Check the proportional-hazards (PH) assumption via Schoenfeld residuals;
# a small p-value means the PH assumption is violated for that term
(res.zph1 <- cox.zph(cfit1))
## chisq df p
## Centre 9.38 2 0.0092
## Age 7.06 3 0.0701
## Maladie 3.73 2 0.1553
## GLOBAL 17.96 7 0.0121
Le test de Schoenfeld donne une p-value > 5 % pour Age et Maladie (hypothèse des risques proportionnels acceptable pour ces variables), mais une p-value < 5 % pour Centre et pour le test global. On peut donc stratifier sur la variable Centre, même si l'un des centres n'est pas significatif dans le modèle.
Si l'hypothèse des risques proportionnels n'est pas vérifiée pour une variable du modèle, il est possible de stratifier le risque de base par rapport à cette variable.
# The PH assumption fails for Centre (p < 5% in cox.zph above), so stratify
# the baseline hazard by Centre and keep Age and Maladie as covariates
cfit2 <- coxph(Surv(Temps, Status) ~ Age + Maladie + strata(Centre), data = BASE)
summary(cfit2)
## Call:
## coxph(formula = Surv(Temps, Status) ~ Age + Maladie + strata(Centre),
## data = BASE)
##
## n= 5317, number of events= 1271
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Age[50-70[ 0.75134 2.11985 0.07200 10.436 < 2e-16 ***
## AgeMoins de 25 -0.41323 0.66151 0.21340 -1.936 0.052814 .
## AgePlus de 70 1.35612 3.88111 0.08249 16.440 < 2e-16 ***
## Maladiehypertension -0.36592 0.69356 0.06674 -5.482 4.2e-08 ***
## Maladierenale -0.28102 0.75501 0.08017 -3.505 0.000456 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Age[50-70[ 2.1198 0.4717 1.8409 2.4411
## AgeMoins de 25 0.6615 1.5117 0.4354 1.0050
## AgePlus de 70 3.8811 0.2577 3.3017 4.5622
## Maladiehypertension 0.6936 1.4418 0.6085 0.7905
## Maladierenale 0.7550 1.3245 0.6452 0.8835
##
## Concordance= 0.651 (se = 0.009 )
## Likelihood ratio test= 364.4 on 5 df, p=<2e-16
## Wald test = 348.4 on 5 df, p=<2e-16
## Score (logrank) test = 386.3 on 5 df, p=<2e-16
Ce résultat est cohérent : au sein d'un même centre de traitement, les patients ont des caractéristiques comparables en termes d'âge et de maladie. Après stratification par centre, les p-values du test de Schoenfeld sont supérieures à 5 % : on accepte l'hypothèse des risques proportionnels au seuil de 5 %.
# Alternative model: stratify the baseline hazard by disease instead
cfit3 <- coxph(Surv(Temps, Status) ~ Age + Centre + strata(Maladie), data = BASE)
str(BASE)
## tibble[,8] [5,317 x 8] (S3: tbl_df/tbl/data.frame)
## $ Patient : num [1:5317] 1 2 3 4 5 6 7 8 9 10 ...
## $ Status : num [1:5317] 0 0 0 1 0 0 0 0 0 0 ...
## $ Temps : num [1:5317] 16 11 35 8 3 25 3 42 8 3 ...
## $ Maladie : chr [1:5317] "hypertension" "renale" "hypertension" "hypertension" ...
## $ Centre : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : Factor w/ 4 levels "[25-50[","[50-70[",..: 1 1 2 2 4 2 1 2 1 1 ...
## $ Homme : Factor w/ 2 levels "Femme","Homme": 1 1 1 2 2 2 2 2 2 1 ...
## $ Maladie1: Factor w/ 3 levels "diabete","hypertension",..: 2 3 2 2 3 2 2 1 1 3 ...
# Each factor is interpreted relative to its reference level
# PH check for the Centre-stratified model
(res.zph <- cox.zph(cfit2))
## chisq df p
## Age 7.35 3 0.062
## Maladie 3.24 2 0.198
## GLOBAL 8.87 5 0.114
# PH check for the Maladie-stratified model
(res.zph <- cox.zph(cfit3))
## chisq df p
## Age 5.19 3 0.158
## Centre 8.32 2 0.016
## GLOBAL 13.88 5 0.016
Avec la stratification sur la variable Centre, on aboutit à la même conclusion que précédemment.